if(!"mvlearnR" %in% installed.packages()){
library(devtools)
devtools::install_github("lasandrall/mvlearnR")
}
library(mvlearnR)
## Warning: replacing previous import 'igraph::spectrum' by 'stats::spectrum' when
## loading 'mvlearnR'
## Warning: replacing previous import 'CVXR::power' by 'stats::power' when loading
## 'mvlearnR'
## Warning: replacing previous import 'igraph::decompose' by 'stats::decompose'
## when loading 'mvlearnR'
## Warning: replacing previous import 'stats::power' by 'CVXR::power' when loading
## 'mvlearnR'
## Warning: replacing previous import 'stats::decompose' by 'igraph::decompose'
## when loading 'mvlearnR'
## Warning: replacing previous import 'stats::spectrum' by 'igraph::spectrum' when
## loading 'mvlearnR'
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# #library(devtools)
# if (interactive()) {
# require("devtools", quietly = TRUE)
# # automatically attaches usethis
# }
# load_all()
The package mvlearnR and accompanying Shiny App is intended for integrating data from multiple sources (e.g. genomics, proteomics, metabolomics). Most existing software packages for multiview learning are decentralized, making it difficult for users to perform comprehensive integrative analysis. The new package wraps statistical and machine learning methods and graphical tools, providing a convenient and easy data integration workflow. For users with limited programming language, we provide a Shiny Application to facilitate data integration. The methods have potential to offer deeper insights into complex disease mechanisms.
The package can be downloaded from GitHub at (https://github.com/lasandrall/mvlearnR). To install the package, you need to have the R-package devtools installed. Then use the function install_github() to install the package.
We provide real data pertaining to COVID-19 severity and status and several simulated data to demonstrate the use of the package. Simulated data for two views and a binary outcome could be read as data(sidaData), data(selpData). The COVID-19 data can be imported as data(COVID). This is a list with 3 entries: Proteomics, RNASeq, and Clinical data.
#load data
data("COVIDData")
#make omics data numeric
Proteomics= apply(as.matrix(COVIDData[[1]]), 2, as.numeric)
RNASeq= apply(as.matrix(COVIDData[[2]]), 2, as.numeric)
Clinical= COVIDData[[3]]
table(Clinical$DiseaseStatus)
##
## COVID-19 non-COVID-19
## 98 22
set.seed(1234)
stratified <- Clinical %>%
group_by(DiseaseStatus) %>%
sample_frac(size = .9)
#train data
Proteomics2=cbind.data.frame(Clinical[,1], Proteomics)
Proteomics.Train=Proteomics2[Proteomics2[,1] %in% stratified$ID, ]
Clinical.Train=Clinical[Clinical[,1] %in% stratified$ID, ]
RNASeq2=cbind.data.frame(Clinical[,1], RNASeq)
RNASeq.Train=RNASeq2[RNASeq2[,1] %in% stratified$ID, ]
#test data
set.diff=setdiff(Clinical[,1],stratified$ID)
Proteomics.Test=Proteomics2[Proteomics2[,1] %in% set.diff, ]
RNASeq.Test=RNASeq2[RNASeq2[,1] %in% set.diff, ]
Clinical.Test=Clinical[Clinical[,1] %in% set.diff, ]
#order data to ensure that the rows are the same
Proteomics.Train=Proteomics.Train[order(Proteomics.Train[,1]),]
Proteomics.Test=Proteomics.Test[order(Proteomics.Test[,1]),]
RNASeq.Train=RNASeq.Train[order(RNASeq.Train[,1]),]
RNASeq.Test=RNASeq.Test[order(RNASeq.Test[,1]),]
Clinical.Train=Clinical.Train[order(Clinical.Train[,1]),]
Clinical.Test=Clinical.Test[order(Clinical.Test[,1]),]
We provide functions to filter data either using supervised or unsupervised statistical methods. The function filter.supervised() can be used to filter each view using four methods: linear, logistic, t-test, and Kruskal-Wallis (KW) test. P-values can be adjusted for multiple hypothesis testing. The function filter.unsupervised() can be used to filter each view using unsupervised methods that include: variance and interquartile range (IQR) filtering.
Results from the supervised filtering approach could be visualized via volcano plots using the function volcanoPlot(). The filtered or original data could be visualized via UMAP with the function umapPlot().
#Supervised filtering- Logistic regression with B-H adjusted pvalues
X=list(Proteomics.Train[,-1], RNASeq.Train[,-1] )
Xtest.in=list(Proteomics.Test[,-1], RNASeq.Test[,-1] ) #testing data will be subsetted to keep only variables that are significant in training set
Y=Clinical.Train$DiseaseStatus.Indicator
filterOmics=filter.supervised( X,
Y,
method = "logistic",
padjust=TRUE,
adjmethod="BH",
thresh = 0.05,
center = FALSE,
scale = FALSE,
log2TransForm = FALSE,
standardize=TRUE,
Xtest = Xtest.in
)
## [1] "Printing top 10 results for significant variables for View 1"
## Coef
## P04196 -2.302249
## P51884 -1.833268
## P30491;P30685;P30490;P18464;P10319;P30498;P30483;P18463;P18465;P30487;P30488;P30485 1.155279
## A0A0C4DFP6;Q9NQ79-2;Q9NQ79;Q9NQ79-3;Q5T4F6 -2.242493
## C9JB55 -2.553349
## P02765;C9JV77 -1.492872
## V9GYM3;P02652;V9GYE3;V9GYG9 -1.549336
## Q6UXB8;Q6UXB8-2 -1.660778
## Q96PD5;Q96PD5-2 -1.498597
## D6W5L6;P07988;H0Y7V6 1.359330
## Pval
## P04196 0.0005777604
## P51884 0.0005777604
## P30491;P30685;P30490;P18464;P10319;P30498;P30483;P18463;P18465;P30487;P30488;P30485 0.0012273746
## A0A0C4DFP6;Q9NQ79-2;Q9NQ79;Q9NQ79-3;Q5T4F6 0.0012464527
## C9JB55 0.0012464527
## P02765;C9JV77 0.0012464527
## V9GYM3;P02652;V9GYE3;V9GYG9 0.0012464527
## Q6UXB8;Q6UXB8-2 0.0013409020
## Q96PD5;Q96PD5-2 0.0014091137
## D6W5L6;P07988;H0Y7V6 0.0017111773
## Keep
## P04196 TRUE
## P51884 TRUE
## P30491;P30685;P30490;P18464;P10319;P30498;P30483;P18463;P18465;P30487;P30488;P30485 TRUE
## A0A0C4DFP6;Q9NQ79-2;Q9NQ79;Q9NQ79-3;Q5T4F6 TRUE
## C9JB55 TRUE
## P02765;C9JV77 TRUE
## V9GYM3;P02652;V9GYE3;V9GYG9 TRUE
## Q6UXB8;Q6UXB8-2 TRUE
## Q96PD5;Q96PD5-2 TRUE
## D6W5L6;P07988;H0Y7V6 TRUE
## View
## P04196 1
## P51884 1
## P30491;P30685;P30490;P18464;P10319;P30498;P30483;P18463;P18465;P30487;P30488;P30485 1
## A0A0C4DFP6;Q9NQ79-2;Q9NQ79;Q9NQ79-3;Q5T4F6 1
## C9JB55 1
## P02765;C9JV77 1
## V9GYM3;P02652;V9GYE3;V9GYG9 1
## Q6UXB8;Q6UXB8-2 1
## Q96PD5;Q96PD5-2 1
## D6W5L6;P07988;H0Y7V6 1
## [1] "Printing top 10 results for significant variables for View 2"
## Coef Pval Keep View
## ASPM 2.334767 0.0003191221 TRUE 2
## BIRC5 2.130810 0.0003191221 TRUE 2
## BUB1 2.533314 0.0003191221 TRUE 2
## BUB1B 2.368631 0.0003191221 TRUE 2
## CCNB2 2.456306 0.0003191221 TRUE 2
## CDC20 1.941254 0.0003191221 TRUE 2
## CDC25C 2.448140 0.0003191221 TRUE 2
## CDCA2 2.465166 0.0003191221 TRUE 2
## CDCA5 2.400132 0.0003191221 TRUE 2
## CDCA8 2.423595 0.0003191221 TRUE 2
We applied mvlearnR to integrate RNA sequencing and proteomics data to identify variables that maximize association between genes and proteins. For this purpose, we used the filtered data from above (i.e. filterOmics results). We also provide simulated data that could be used.
We use the SELPCCA method. CCA is a multivariate linear method for maximizing association between data from two sources. CCA finds weighted combinations of variables in each data that maximize the overall dependency structure between pairs of data. The function cvselpscca() can be used to obtain low-dimensional linear representations maximizing associations between the genes and proteins. It will allow users to select genes and proteins contributing to maximal correlation between the pairs of data. In the example below, we obtain the first and second canonical variates for each view with the option ncancorr=2 in cvselpscca().
Based on the results, there are 78 proteins with nonzero coefficients in the first canonical variate for View 1 (Proteomics data), and 32 genes for View 2 (RNASeq data). The second canonical variate has 54 proteins and 9 genes with nonzero coefficients for the proteomics and RNASeq data, respectively. The estimated canonical correlation coefficents are 0.64 and 0.6 for the first and second canonical correlation vectors, respectively.
#can use simulated data
#data("selpData")
#Xdata1 <- selpData[[1]]
#Xdata2 <- selpData[[2]]
#We use the filtered data from above and obtain first and second canonical correlation vectors
#We use cross-validation to choose tuning parameters for sparsity
Xdata1 <- filterOmics$X[[1]] #proteomics
Xdata2 <- filterOmics$X[[2]] #RNASeq
mycvselpcca=cvselpscca(Xdata1, Xdata2, ncancorr=2)
## [1] "Current iteration is 1"
## [1] "Current iteration is 2"
## [1] "Applying optimal tuning parameter on whole data"
## [1] "Current Iteration Is: 1"
## [1] "Current Iteration Is: 2"
## [1] "Number of non-zero Xdata1 or View 1: 78 54"
## [1] "Number of non-zero Xdata2 or View 2: 32 9"
## [1] "Corr(Xdata1*alpha,Xdata2*beta): 0.636 0.599"
## [1] "Sparse CCA CovStructure used is: Iden"
Our goal is to use the learned low-dimensional representation(s) to discriminate between those with and without COVID-19, or to predict COVID-19 severity (defined as hospital free days [HFD45]). Since SELPCCA is an unsupervised method, it can only be used to identify variables contributing to the maximum association between the two views.
If an outcome is available, one can associate the learned low-dimensional representation(s) with the outcome. We provide the function selpscca.pred() for this purpose. We provide the options gaussian, binomial, poisson, and survival to model continuous, categorical, count, or time-to-event data, respectively.
Application of this function using the first and second canonical variates for the genes and proteins and the outcome of COVID-19 status suggests that the first canonical variates for the proteomics and gene data are able to discriminate between those with and without COVID-19.
The function predict() can be used to predict out of sample data using the learned low-dimensional representations. The option type in predict() is the type of prediction required, and it follows the same definition in predict.glm() and predict.coxph() in R.
We use results from cvselp() to predict COVID-19 severity (defined as hospital free days [HFD45]). Note that HFD45 score of 0 indicates the highest severity and the patient was in the hospital after 45 days, or died before the 45-day period ended. A higher score indicates lower disease severity.
Results suggest that the first canonical variate for view 1 is significantly associated with hospital free days (p-value < 0.05)
#Use results from cvselpcca. One can also use selpPredict() to train the model by setting fitselpCCA=NULL.
Y.HFD45=as.numeric(Clinical.Train$HFD45)
myresult.HFD45=selpscca.pred(Xdata1, Xdata2, Y.HFD45,ncancorr=2,fitselpCCA=mycvselpcca, family="gaussian",showProgress=T)
## Fitting SELPCCA Model
## Fitting Prediction Model
print(summary(myresult.HFD45[["mod.fit"]]))
##
## Call:
## stats::lm(formula = Y ~ ., data = selp.dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -31.951 -7.931 0.894 9.448 25.834
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 24.4815 1.2854 19.046 < 2e-16 ***
## X1_1 -2.5405 0.4193 -6.059 2.27e-08 ***
## X1_2 0.9771 0.8782 1.113 0.268
## X2_1 0.6363 0.4239 1.501 0.136
## X2_2 -0.1524 0.9025 -0.169 0.866
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.36 on 103 degrees of freedom
## Multiple R-squared: 0.364, Adjusted R-squared: 0.3393
## F-statistic: 14.74 on 4 and 103 DF, p-value: 1.493e-09
#Results suggest that the first canonical variate for view 1 (proteins) is significantly associated with hospital free days (p-value < 0.05), while the others are not significantly associated with hospital free days (p-value >= 0.05).
Predict out of sample data and obtain train and test performance metrics.
##Train Performance Metrics
newpredictions=predict(myresult.HFD45, newdata=Xdata1, newdata2=Xdata2)
Y.pred=round(newpredictions$pred.mod)
train.metrics=PerformanceMetrics(Y.pred,Y.HFD45,family='gaussian',isPlot=TRUE)
print(train.metrics)
## Metrics
## Mean.Squared.Error 169.8981481
## Root.Mean.Squared.Error 13.0344984
## Relative.Squared.Error 0.6349802
## Root.Relative.Squared.Error 0.7968565
## Root.Absolute.Error 0.7284701
## Mean.Absolute.Error 10.5648148
##Test Performance Metrics
Xtest1 <- filterOmics$Xtest[[1]] #proteomics
Xtest2 <- filterOmics$Xtest[[2]] #RNASeq
Y.test=as.numeric(Clinical.Test$HFD45)
newpredictions=predict(myresult.HFD45, newdata=Xtest1, newdata2=Xtest2)
Y.pred=round(newpredictions$pred.mod)
test.metrics=PerformanceMetrics(Y.pred,Y.test,family='gaussian',isPlot=TRUE)
print(test.metrics)
## Metrics
## Mean.Squared.Error 202.4166667
## Root.Mean.Squared.Error 14.2273211
## Relative.Squared.Error 0.5790687
## Root.Relative.Squared.Error 0.7609656
## Root.Absolute.Error 0.6681818
## Mean.Absolute.Error 12.2500000
We use results from cvselp() to predict COVID-19 class membership . We obtain predicted probabilities, round these probabilities for class assignment, and obtain classification error.
#Use results from cvselpcca. One can also use selpPredict() to train the model by setting fitselpCCA=NULL.
Y.train=Clinical.Train$DiseaseStatus.Indicator
myresult=selpscca.pred(Xdata1, Xdata2, Y.train,fitselpCCA=mycvselpcca, family="binomial",showProgress=T)
## Fitting SELPCCA Model
## Fitting Prediction Model
Print summary of results.
Results suggest that the first canonical variate for view 1 and view 2 are significantly associated with COVID-19 status (p-value < 0.05). That is, the first canonical variates for views 1 (proteins) and 2 (genes) are able to discriminate between those with and without COVID-19.
print(summary(myresult[["mod.fit"]]))
##
## Call:
## stats::glm(formula = factor(Y) ~ ., family = stats::binomial,
## data = selp.dat)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 7.1934 2.4013 2.996 0.00274 **
## X1_1 0.7856 0.3128 2.511 0.01203 *
## X1_2 -0.6519 0.4968 -1.312 0.18946
## X2_1 0.9732 0.3945 2.467 0.01362 *
## X2_2 0.2319 0.4886 0.475 0.63502
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 103.500 on 107 degrees of freedom
## Residual deviance: 16.346 on 103 degrees of freedom
## AIC: 26.346
##
## Number of Fisher Scoring iterations: 9
Predict out of sample data and obtain prediction metrics
In this example we obtain predictions based on training data but one can obtain predictions based on testing data, too.
For a binary outcome, we provide the following metrics: “Accuracy”,“Error rate”,“Sensitivity”, “Specificity”, “Matthews Correlation Coefficient (MCC)”, “Balanced Accuracy”,“Balanced Error Rate”, “F1 Score”, “False.Discovery.Rate”, and “Positive Predictive Value”.
For a continuous outcome, we provide the following metrics: “Mean Squared Error”,“Root Mean.Squared Error”,“Relative Squared Error”, “Root Relative Squared.Error”, “Root Absolute Error”, “Mean Absolute Error”.
We provide an option to plot an ROC curve for a binary outcome, and a scatter plot of predicted and observed values, for a continuous outcome.
##Train Performance Metrics
newpredictions=predict(myresult, newdata=Xdata1, newdata2=Xdata2)
Y.pred=round(newpredictions$pred.mod)
train.metrics=PerformanceMetrics(Y.pred,Y.train,family='binomial',isPlot=FALSE)
print(train.metrics)
## Metrics
## Accuracy 0.96296296
## Error.rate 0.03703704
## Sensitivity 0.97727273
## Specificity 0.90000000
## Matthews.Correlation.Coefficient 0.87727273
## Balanced.Accuracy 0.93863636
## Balanced.Error.Rate 0.06136364
## F1.Score 0.97727273
## False.Discovery.Rate 0.02272727
## Positive.Predictive.Value 0.97727273
##Test Performance Metrics
Xtest1 <- filterOmics$Xtest[[1]] #proteomics
Xtest2 <- filterOmics$Xtest[[2]] #RNASeq
Y.test=Clinical.Test$DiseaseStatus.Indicator
newpredictions=predict(myresult, newdata=Xtest1, newdata2=Xtest2)
Y.pred=round(newpredictions$pred.mod)
test.metrics=PerformanceMetrics(Y.pred,Y.test,family='binomial',isPlot=FALSE)
print(test.metrics)
## Metrics
## Accuracy 0.8333333
## Error.rate 0.1666667
## Sensitivity 1.0000000
## Specificity 0.0000000
## Matthews.Correlation.Coefficient 0.0000000
## Balanced.Accuracy 0.5000000
## Balanced.Error.Rate 0.5000000
## F1.Score 0.9090909
## False.Discovery.Rate 0.1666667
## Positive.Predictive.Value 0.8333333
Results from the supervised filtering approach for each data could be visualized via volcano plots using the function volcanoPlot().
volcanoPlot(filterOmics)
The filtered or original data could be visualized with UMAP (uniform manifold approximation and projection) with the function umapPlot(). Here, UMAP can be applied to PCA reduced data as well as the original or filtered data.
umapPlot(filterOmics)
We provide the function VarImportancePlot() to visualize the weights (in absolute value) of the low-dimensional loadings. Since these loadings are standardized to have unit norm, a variable with larger weight contributes more to the association between the views (for SELPCCA) or to the association between the views and discrimination of classes within each view (for SIDA and SIDANet). We only show the top 20 variables and their weights but one can view data matrix for all variables.
Here’s the variable importance tables and plots from SELPCCA.
VarImportancePlot(mycvselpcca)
## View 1 Canonical Correlation Vector
## 1 1
## 2 1
## 3 1
## 4 1
## 5 1
## 6 1
## 7 1
## 8 1
## 9 1
## 10 1
## Variable Name
## 1 P01715
## 2 P30491;P30685;P30490;P18464;P10319;P30498;P30483;P18463;P18465;P30487;P30488;P30485
## 3 D6W5L6;P07988;H0Y7V6
## 4 P0DP01
## 5 P02765;C9JV77
## 6 P01717
## 7 P06331
## 8 A0A075B6H9
## 9 P02792
## 10 Q6UXB8;Q6UXB8-2
## Loading Absolute Loading Normalized Relative Importance
## 1 0.229 0.229 1.000
## 2 0.223 0.223 0.975
## 3 0.192 0.192 0.839
## 4 0.187 0.187 0.817
## 5 -0.187 0.187 0.815
## 6 0.185 0.185 0.807
## 7 0.181 0.181 0.789
## 8 0.179 0.179 0.782
## 9 0.175 0.175 0.765
## 10 -0.168 0.168 0.734
## View 1 Canonical Correlation Vector
## 88 2
## 89 2
## 90 2
## 91 2
## 92 2
## 93 2
## 94 2
## 95 2
## 96 2
## 97 2
## Variable Name Loading
## 88 P13639 -0.390
## 89 P0DJI9 -0.265
## 90 A0A075B6H9 0.257
## 91 P02766;A0A087WT59;A0A087WV45 0.239
## 92 P02792 0.235
## 93 G3XAP6;P49747;P49747-2;CON__ENSEMBL:ENSBTAP00000006074 -0.212
## 94 P17066 -0.205
## 95 Q5VY30;P02753 0.197
## 96 P06727 0.185
## 97 P04196 0.183
## Absolute Loading Normalized Relative Importance
## 88 0.390 1.000
## 89 0.265 0.679
## 90 0.257 0.658
## 91 0.239 0.612
## 92 0.235 0.603
## 93 0.212 0.544
## 94 0.205 0.526
## 95 0.197 0.506
## 96 0.185 0.476
## 97 0.183 0.470
## View 2 Canonical Vector Variable Name Loading Absolute Loading
## 1 1 UBE2C 0.460 0.460
## 2 1 CDC6 0.408 0.408
## 3 1 DEPDC1B 0.394 0.394
## 4 1 CCNA2 0.293 0.293
## 5 1 CDCA2 0.238 0.238
## 6 1 CDC25C 0.233 0.233
## 7 1 TICRR 0.203 0.203
## 8 1 PCLAF 0.200 0.200
## 9 1 CDK1 0.191 0.191
## 10 1 CDC45 0.146 0.146
## Normalized Relative Importance
## 1 1.000
## 2 0.887
## 3 0.858
## 4 0.637
## 5 0.518
## 6 0.506
## 7 0.441
## 8 0.436
## 9 0.416
## 10 0.318
## View 2 Canonical Vector Variable Name Loading Absolute Loading
## 2574 2 GPR35 0.721 0.721
## 2575 2 UPK3A 0.521 0.521
## 2576 2 IL4 0.369 0.369
## 2577 2 ASGR1 0.195 0.195
## 2578 2 MMP17 0.141 0.141
## 2579 2 ARPIN-AP3S2 0.090 0.090
## 2580 2 GPBAR1 0.061 0.061
## 2581 2 OLFM4 -0.056 0.056
## 2582 2 LTB 0.022 0.022
## 2583 2 A2M 0.000 0.000
## Normalized Relative Importance
## 2574 1.000
## 2575 0.723
## 2576 0.512
## 2577 0.270
## 2578 0.195
## 2579 0.125
## 2580 0.084
## 2581 0.077
## 2582 0.031
## 2583 0.000
## NULL
We provide the function DiscriminantPlots() to plot the discriminant vectors for visualizing class separation. Here’s an example plot with two classes, for the first canonical vector.
#train data
Xdata1 <- filterOmics$X[[1]]
Xdata2 <- filterOmics$X[[2]]
Y=filterOmics$Y
Xdata=list(Xdata1, Xdata2)
hatalpha=list(mycvselpcca$hatalpha[,1],mycvselpcca$hatbeta[,1])
DiscriminantPlots(Xdata, Y,hatalpha,method.used="SELPCCA")
#test data
Xtest1 <- filterOmics$Xtest[[1]] #proteomics
Xtest2 <- filterOmics$Xtest[[2]] #RNASeq
Xtest.in=list(Xtest1, Xtest2)
Ytest.in=Clinical.Test$DiseaseStatus.Indicator
DiscriminantPlots(Xtest.in, Ytest.in, hatalpha,method.used="SELPCCA")
#The classes are well-separated especially in the training set.
We also provide the function CorrelationPlots() for visualizing correlations between estimated canonical vectors vectors. Here’s an example graph with two classes and views moderately correlated. Correlation estimate is given in absolute value.
Xdata1 <- filterOmics$X[[1]]
Xdata2 <- filterOmics$X[[2]]
Y=filterOmics$Y
Xdata=list(Xdata1, Xdata2)
hatalpha=list(mycvselpcca$hatalpha[,1],mycvselpcca$hatbeta[,1])
CorrelationPlots(Xdata, Ytest=Y, hatalpha, method.used="SELPCCA")
#We notice that the views are moderately correlated, and the classes are well-separated.
We provide the function networkPlot() to visualize associations of selected variables between pairs of views. We estimate pairwise similarity matrix using low-dimensional representations of our sparse integrative analysis methods (selpcca, sida, sidanet). We create bipartite graph (bigraph) where variables or nodes from one view are connected to variables or nodes from another view. We construct the bigraph from a pairwise similarity matrix obtained from the outputs of our integrative analysis methods. We estimate the similarity score between a pair of selected variables from two views by calculating the inner product of each selected variable and the sum of canonical variates (for SELPCCA) or discriminant vectors (for SIDA, SIDANet) for the pairs of views. The entries in the similarity matrix is a robust approximation of the Pearson correlation between pairs of variables and the two views under consideration. This network graph has potential to shed light on the complex associations between pairs of views.
Variable pairs with high similarity score may be of interest. The relevance of the associations can be explored by changing the cutoff. This can also be used to reduce the size of the graph, for dense network.
By default, dashed lines indicate negative associations and solid lines represent positive associations.
Here’s the relevance network for results using SELPCCA. We set the correlation cutoff to 0.58 for a less dense network.
networkPlot(mycvselpcca, cutoff=0.58)
#The plot suggests that the protein Immunoglobulin lambda variable 3-1 (UID P01715) is highly positively correlated with many genes (including CDC6, CCNA2, UBE2C), and the protein Alpha-2-HS-glycoprotein (UID P02765) is highly negatively correlated with many genes (including CCNA2 and CDC6).
We provide the function LoadingsPlot() to plot discriminant and canonical correlation vectors. These graphs are useful for visualizing how selected variables from SIDA/SIDANet and SELPCCA contribute to the first and second discriminant (for SIDA and SIDANet) or canonical correlation (for SELPCCA) vectors.
Variables farther from the origin and close to the first or second axis have higher impact on the first or second discriminant/canonical vectors, respectively.
Variables farther from the origin and between both first and second axes have similar higher contributions to the first and second discriminant/canonical correlation vectors.
In both situations, for SIDA and SIDANet, this suggests that these variables contribute more to the separation of classes and association of views.
For SELPCCA, this suggests that these variables contribute more to the association between the two views. This plot can only be generated for classification and association problems with 3 or more classes (SIDA and SIDANet), or for CCA problems with two or more canonical correlation vectors requested (i.e. ncancorr > 1 for SELPCCA).
The angle between two vectors also give an indication of how the two variables are correlated. In particular, vectors that are close to each other suggests that the variables have high positive correlation. Vectors that are about 90 degrees indicate that the two variables are uncorrelated. Vectors that have an angle close to 180 degrees indicate that the two variables have negative correlation.
In order not to clutter the graph, we provide the option keep.loadings for the number of variables with highest absolute weights to show on the graph for each view.
Here’s the loadings plot for results from SELPCCA In this example, we show the top 7 variables with largest absolute weights on the plot for view 1 and top 3 for view 2.
Protein ID A0A075B6H9 appears to be highly correlated with P02792. The gene UBE2C is loaded on the first canonical vector; genes UPK3A and GPR35 are loaded on the second canonical vector.
LoadingsPlots(mycvselpcca,keep.loadings = c(7,3))
#Protein ID A0A075B6H9 appears to be highly correlated with P02792. The gene UBE2C is loaded on the first canonical vector; genes UPK3A and GPR35 are loaded on the second canonical vector.
Biplots are useful for representing both loadings plot and discriminant scores/canonical correlation variates.
We provide the function WithinViewBiplot() to visualize the scores and loadings for each view separately. The protein Immunoglobulin lambda 4-69 (UID A0A075B6H9) appears to be highly correlated with the protein Ferritin light chain (UID P02792) and Immunoglobulin lambda variable 3-1 (UID P01715).
The gene UBE2C is loaded on the first CCA vector, and the genes UPK3A and GPR35 are loaded on the second CCA vector.
The COVID-19 samples are well separated from the non-COVID-19 samples.
Y=Clinical.Train$DiseaseStatus.Indicator
WithinViewBiplot(mycvselpcca,Y,Xtest=NULL,keep.loadings = c(7,3))
Now, instead of visualizing biplots separately for each view, we provide the function BetweenViewBiplot() to graph scores and loadings for pairs of views.
In this graph, dashed red vectors represent loadings plot for the second view. And solid black vectors represent loadings plot for the first view.
The scores are the sum of scores for the two views.
Here’s an example Biplot for the COVID-19 application, where we show the top 7 variables for view 1 and the top 3 for view 2.
The protein Immunoglobulin lambda variable 3-1 (UID P01715) appears to be positively correlated with the gene UBE2C. The protein P02792 appears to be negatively correlated with the genes UPK3A and GPR35.
Y=Clinical.Train$DiseaseStatus.Indicator
BetweenViewBiplot(mycvselpcca,Y,Xtest=NULL, keep.loadings = c(7,3))
SELP Predict Example with train and testing data using simulated data.
#Load Data
data("sidaData")
Xdata <- sidaData[[1]]
Xdata[[1]] <- Xdata[[1]][,1:500]
Xdata[[2]] <- Xdata[[2]][,1:500]
Y.simul <- sidaData[[2]]-1 #Y needs to be 0 or 1 for binomial family
Xtestdata <- sidaData[[3]]
Xtestdata[[1]] <- Xtestdata[[1]][,1:500]
Xtestdata[[2]] <- Xtestdata[[2]][,1:500]
Ytest.simul <- sidaData[[4]]-1
fit.selp <- selpscca.pred(Xdata[[1]], Xdata[[2]], Y.simul,fitselpCCA=NULL,
family="binomial")
## Fitting SELPCCA Model
## [1] "Current iteration is 1"
## [1] "Current iteration is 2"
## [1] "Current iteration is 3"
## [1] "Current iteration is 4"
## [1] "Current iteration is 5"
## [1] "Applying optimal tuning parameter on whole data"
## [1] "Current Iteration Is: 1"
## [1] "Current Iteration Is: 2"
## [1] "Current Iteration Is: 3"
## [1] "Current Iteration Is: 4"
## [1] "Current Iteration Is: 5"
## [1] "Number of non-zero Xdata1 or View 1: 20"
## [1] "Number of non-zero Xdata2 or View 2: 20"
## [1] "Corr(Xdata1*alpha,Xdata2*beta): 0.985"
## [1] "Sparse CCA CovStructure used is: Iden"
## Fitting Prediction Model
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
pred.fit <- predict(fit.selp, newdata=Xtestdata[[1]],
newdata2=Xtestdata[[2]], type = "response")
summary(pred.fit$pred.mod)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.2066 0.4977 1.0000 1.0000
#Performance metrics
Y.pred=round(pred.fit$pred.mod)
test.metrics=PerformanceMetrics(Y.pred,Ytest.simul,family='binomial',isPlot=TRUE)